# Colab-only: mount Google Drive so the dataset can be read from it.
from google.colab import drive
drive.mount('/content/drive/')
#Set your project path & file name
# project_path = '/content/drive/My Drive/ML_Project/Capstone/'
project_path = '/content/drive/My Drive/Capstone_NLP/'
file_name ='IT-Ticket-Classification.xlsx'
import pandas as pd
import numpy as np
import re
import sys
# Load the raw ticket dump.
# NOTE(review): recent pandas versions no longer accept an `encoding`
# kwarg on read_excel — drop it if this line fails.
itTicketDF=pd.read_excel(project_path+file_name,encoding=sys.getfilesystemencoding())
itTicketDF.head()
# Distinct caller names — kept so caller names can be scrubbed out of the
# description text later (see clean_data).
callers = itTicketDF['Caller'].unique()
callers.shape
# Caller and short description are not used as model features.
itTicketDF.drop(["Caller",'Short description'],axis=1,inplace= True)
itTicketDF.info()
# Per-class ticket counts for the target column.
targetClassCnt=itTicketDF['Assignment group'].value_counts()
targetClassCnt.describe()
# Merge sparsely-populated target classes: any 'Assignment group' with fewer
# than 10 tickets is relabelled 'misc_grp' so the classifier is not asked to
# learn classes with almost no support.
# value_counts() gives every group's size in one pass — no per-group
# groupby/get_group lookups needed (the old `sample` groupby was used only here).
group_sizes = itTicketDF['Assignment group'].value_counts()
regroup = group_sizes[group_sizes < 10].index.tolist()
print('Found {} groups which have under 10 samples'.format(len(regroup)))
regroup_set = set(regroup)  # O(1) membership test inside the row-wise apply
itTicketDF['Assignment group']=itTicketDF['Assignment group'].apply(lambda x : 'misc_grp' if x in regroup_set else x)
# Unique Groups check
itTicketDF['Assignment group'].unique()
##Pre-Processing: visualise the Assignment Group distribution
import seaborn as sns
import matplotlib.pyplot as plt
#add to remove warning for python 3.6 dependency
import warnings
# NOTE(review): pandas.util.testing is deprecated (removed in pandas 2.x);
# it is imported here only to silence a seaborn warning on old stacks.
import pandas.util.testing as tm
plt.style.use('ggplot')
%matplotlib inline
# Classes ordered by ticket count, most frequent first.
descending_order = itTicketDF['Assignment group'].value_counts().sort_values(ascending=False).index
plt.subplots(figsize=(22,5))
#added code for x label rotate
ax=sns.countplot(x='Assignment group', data=itTicketDF, color='royalblue',order=descending_order)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
plt.tight_layout()
plt.show()
#Fill 'Description' entries that are null with a blank space
itTicketDF['Description'].fillna(value=' ', inplace=True)
from dateutil import parser
def is_valid_date(date_str):
    """Return True if *date_str* can be parsed as a date by dateutil.

    Used by clean_data to strip date-like tokens from ticket descriptions.
    """
    try:
        parser.parse(date_str)
        return True
    except (ValueError, OverflowError, TypeError):
        # dateutil raises ValueError (incl. ParserError) or OverflowError
        # for non-date strings; catching only these instead of a bare
        # `except` lets genuine bugs propagate instead of being swallowed.
        return False
def clean_data(text):
    """Normalise one raw ticket description.

    Lowercases, strips date-like tokens, e-mail header boilerplate
    (from:/to:/subject:/...), e-mail addresses, numbers, URLs, HTML
    entities and non-ASCII characters, then removes the callers' name
    parts and collapses whitespace.  Returns the cleaned string.
    """
    text=text.lower()
    # Drop any whitespace-separated token dateutil can parse as a date.
    text = ' '.join([w for w in text.split() if not is_valid_date(w)])
    # E-mail header boilerplate.
    text = re.sub(r"received from:",' ',text)
    text = re.sub(r"from:",' ',text)
    text = re.sub(r"to:",' ',text)
    text = re.sub(r"subject:",' ',text)
    text = re.sub(r"sent:",' ',text)
    text = re.sub(r"ic:",' ',text)
    text = re.sub(r"cc:",' ',text)
    text = re.sub(r"bcc:",' ',text)
    #Remove email addresses
    text = re.sub(r'\S*@\S*\s?', '', text)
    # Remove numbers
    text = re.sub(r'\d+','' ,text)
    # Remove Non Dictionary character-TODO
    # Remove new line characters
    text = re.sub(r'\n',' ',text)
    # Remove hashtag while keeping hashtag text
    text = re.sub(r'#','', text)
    # Spell '&' (optionally followed by ';') out as 'and'
    text = re.sub(r'&;?', 'and',text)
    # Remove HTML special entities (e.g. &amp;)
    text = re.sub(r'\&\w*;', '', text)
    # Remove hyperlinks
    text = re.sub(r'https?:\/\/.*\/\w*', '', text)
    # Remove characters beyond the Unicode Basic Multilingual Plane (> U+FFFF)
    text= ''.join(c for c in text if c <= '\uFFFF')
    text = text.strip()
    # Keep only ASCII digits/letters; everything else becomes a space.
    text = ' '.join(re.sub("[^\u0030-\u0039\u0041-\u005a\u0061-\u007a]", " ", text).split())
    # Scrub every part of every caller name from the text.
    # NOTE(review): `callers` keeps the original casing from the 'Caller'
    # column while `text` was lowercased above, so mixed-case name parts
    # never match; also plain substring replace can delete fragments of
    # unrelated words — confirm this is the intended behaviour.
    for name in callers:
        namelist = [part for part in name.split()]
        for namepart in namelist:
            text = text.replace(namepart,'')
    # Drop stranded single letters, collapse repeated spaces, trim.
    text = re.sub(r"\s+[a-zA-Z]\s+", ' ', text)
    text = re.sub(' +', ' ', text)
    text = text.strip()
    return text
# Clean every description, then drop rows whose cleaned text is < 3 chars.
itTicketDF['Description'] = itTicketDF['Description'].apply(clean_data)
itTicketDF['length']=[len(text) for text in itTicketDF['Description']]
itTicketDF=itTicketDF[itTicketDF['length']>=3]
# Keep only words longer than 2 characters.
itTicketDF['Description'] = itTicketDF['Description'].apply(lambda x : " ".join([word for word in x.split() if(len(word)>2)]))
#itTicketDF.drop(['length'],axis=1,inplace= True)
# Hand-collected list of (mostly German/Portuguese) words used to flag rows
# that probably need machine translation.
germanwordlist = ['bitte','nicht','konto','probleme','berechtigung','defekt','mehr','ausgetauscht','rechner', 'drucker','teilweise','freigegeben','genannten','anmeldeaccount',
'besprochen','werden','durchwahl','oben','einrichten','zeitwirtschaft','seit','morgens','beheben','keine','zeitbuchungen','vorhanden','dringend','fehler',
'werk','anmelde','auftrag','kein','skannen','freundlichen','werkzeuge,','hartstoffe','maste','schutzw','fertigung','immer','sehr','zugriff','freundliche',
'geehrter','souzarft','noch','verbindungsherstellung','meldung','erneuten','glich','proben','beilageproben','beilage','auswerten','sinterleitstand','reparar',
'reparo','rechner','koenigsee','entregar','atualiza','declara','programdntya','funcionando','preciso','hitacni','grergtger','zugriffsrechte','teamleiter',
'abholen','wegen','weit','absender','wenn','abrechnung']
# Single alternation regex built from the word list.
pattern = '|'.join(germanwordlist)
pattern
# Indices of rows whose description contains any listed word.
germanDescIndex = itTicketDF[itTicketDF['Description'].str.contains(pattern)].index
germanDescIndex
# googletrans/textblob error out when run on every row, so only the rows whose
# 'Description' matched the German word list above are translated.
#Install google translator by pip if not installed
!pip install -q googletrans
from googletrans import Translator
def translateIfRequired(x):
    """Translate *x* to English via Google Translate when its detected
    language is not already English; otherwise return it unchanged.

    Performs network calls; may raise if the googletrans service
    misbehaves.
    """
    translator = Translator()
    if translator.detect(x).lang != 'en':
        translatedText = translator.translate(x).text
    else:
        translatedText = x
    return translatedText
# Translate the flagged rows in place.  Use .loc instead of chained indexing
# (itTicketDF['Description'][rowId]) — chained assignment may write to a
# temporary copy and triggers pandas' SettingWithCopyWarning.
for rowId in germanDescIndex:
    itTicketDF.loc[rowId, 'Description'] = translateIfRequired(itTicketDF.loc[rowId, 'Description'])
# Re-check how many rows still match the German word pattern after translation.
germanDescIndex_new = itTicketDF[itTicketDF['Description'].str.contains(pattern)].index
germanDescIndex_new
itTicketDF['Description'][8232]
# Normalise descriptions: flat index, trim, lowercase.
itTicketDF.reset_index(drop=True,inplace=True)
itTicketDF['Description'] = itTicketDF['Description'].str.strip()
itTicketDF['Description'] = itTicketDF['Description'].str.lower()
from collections import OrderedDict
# De-duplicate the words inside each description, preserving first-seen order.
itTicketDF['Description']=itTicketDF['Description'].str.split().apply(lambda x:OrderedDict.fromkeys(x).keys()).str.join(' ')
itTicketDF.info()
from nltk.corpus import stopwords
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
# Strip English stop words.  Fixes vs. the original per-row loop:
#  * a set gives O(1) membership tests (the stopword list was scanned per word);
#  * Series.apply with a single column assignment replaces the chained
#    per-row assignment (itTicketDF['Description'][i] = ...), which pandas
#    flags with SettingWithCopyWarning and may silently write to a copy.
sr = set(stopwords.words('english'))
itTicketDF['Description'] = itTicketDF['Description'].apply(
    lambda text: " ".join(word for word in text.split(' ') if word not in sr))
# install spacy and plt for gensim
!pip install -q spacy
import spacy
# NOTE(review): spacy.load('en') is the legacy shorthand; newer spaCy
# versions require the full model name, e.g. 'en_core_web_sm'.
nlp = spacy.load('en', disable=['parser', 'ner'])
# Defined but not referenced below — lemmatize_text keeps every token.
allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']
def lemmatize_text(text):
    """Return *text* with every token replaced by its spaCy lemma."""
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc])
itTicketDF['Description'] = itTicketDF['Description'].apply(lemmatize_text)
itTicketDF['Description'][8232]
# Optional spell-correction pass, currently disabled.
# !pip install pyspellchecker
# from spellchecker import SpellChecker
# def correctSpelling(text):
# spell = SpellChecker()
# spell.word_frequency.load_words(['netweaver','-pron-','caas','hostname','unmonitored','activesync','audiocode','isp','apac','skype','callie','worklist','gsc','verizon',
# 'firewall','wifi','downloader','uploaded','crm','mms','iphone','rgds','thx','inbox','prefetch','mbps','website','www','inplant','openstage',
# 'costcenter','undeliverable','svC','emea','html','reinstall','upload','jpg','signout','malware','cvss','rar','xls','uplink','ziped','uncaught',
# 'xlsx','wlan','webpage','uploader','firefox','callback','acct','abap','svchost','webserviceclient','wlc','sid'])
# wordlist = [word for word in text.split()]
# misspelled = spell.unknown(wordlist)
# for incorrect in misspelled:
# correct = spell.correction(incorrect)
# text = text.replace(incorrect, correct)
# return text
# itTicketDF['Description'] = itTicketDF['Description'].apply(correctSpelling)
!pip install -q pyLDAvis
# Gensim
import gensim
import gensim.corpora as corpora
# Snowball stemming was replaced with gensim's simple_preprocess tokenizer.
from gensim.utils import simple_preprocess
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
# spacy for lemmatization
import spacy
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim # don't skip this
warnings.filterwarnings("ignore",category=DeprecationWarning)
# simple_preprocess expects plain strings, so pull the column out as a list.
combined_text=itTicketDF.Description.values.tolist()
combined_text[1]
# Convert each sentence into a token list; simple_preprocess tokenizes internally.
#https://radimrehurek.com/gensim/utils.html#gensim.utils.simple_preprocess
def sent_to_words(sentences):
    """Yield one token list per input sentence (punctuation removed)."""
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True)) # deacc=True removes punctuations
data_words = list(sent_to_words(combined_text))
print(data_words[1])
# Build the bigram and trigram models
#https://radimrehurek.com/gensim/models/phrases.html
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Phraser is a slimmed, faster export of the trained Phrases model.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
print(bigram_mod[data_words[1]])
# See trigram example
print(trigram_mod[bigram_mod[data_words[1]]])
def make_bigrams(texts):
    """Merge frequent word pairs in each token list into bigram tokens."""
    return [bigram_mod[doc] for doc in texts]
def make_trigrams(texts):
    """Merge frequent pairs then triples in each token list."""
    return [trigram_mod[bigram_mod[doc]] for doc in texts]
# Form Bigrams
data_words_bigrams = make_bigrams(data_words)
print(data_words_bigrams[1])
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Word cloud over the whole (bigram-merged) corpus.
wordclouds=' '.join(map(str, data_words_bigrams))
wordcloud = WordCloud(width=480, height=480, max_font_size=20, min_font_size=10).generate(wordclouds)
plt.figure(figsize=(20,10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.margins(x=0, y=0)
plt.show()
# Second cloud, capped at the 100 most frequent words.
wordcloud_2 = WordCloud(width=480, height=480, max_words=100).generate(wordclouds)
plt.figure(figsize=(10,10))
plt.imshow(wordcloud_2, interpolation="bilinear")
plt.axis("off")
plt.margins(x=0, y=0)
plt.show()
#Copying to new dataframe to create wordclouds per target class
new_df = itTicketDF.copy()
new_df['words'] = data_words_bigrams
new_df
#Assignment groups sorted by ticket count, most frequent first
value = new_df['Assignment group'].value_counts().sort_values(ascending=False).index
value
# Creating a function for per-group wordclouds
def wordcloud_grp(f, x):
    """Render a word cloud of the 50 most common words for one group.

    f : iterable of token lists belonging to the group
    x : group label, used in the plot title
    """
    wordclouds_0=' '.join(map(str, f))
    wc = WordCloud(width=480, height=480, max_font_size=20, min_font_size=10, max_words=50).generate(wordclouds_0)
    plt.figure(figsize=(20,10))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title("Most common 50 words of {}".format(x))
    plt.margins(x=0, y=0)
    plt.show()
# Word cloud for each of the 50 most frequent Assignment groups.
for i in range(50):
    Grp = new_df[new_df ['Assignment group'] == value[i]]
    Grp = Grp['words']
    wordcloud_grp(Grp,value[i])
# Create the id -> word Dictionary
id2word = corpora.Dictionary(data_words_bigrams)
# Create Corpus from post clean data
texts = data_words_bigrams
# Bag of words: each doc becomes a list of (word_id, frequency) pairs.
#https://radimrehurek.com/gensim/corpora/dictionary.html
corpus = [id2word.doc2bow(text) for text in texts]
# Raw (id, freq) view — not human readable
print(corpus[:1])
# Human-readable (word, freq) view.
# NOTE(review): the comprehension variable `id` shadows the builtin.
print([[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]])
The bag-of-words model works together with the bigram and trigram models, giving more insight into the words and their frequencies in document 1.
# Build LDA model over the bag-of-words corpus (7 topics).
#https://radimrehurek.com/gensim/models/ldamodel.html
lda_model = LdaModel(corpus=corpus,id2word=id2word,num_topics=7,random_state=200,update_every=1,chunksize=800,passes=10,alpha='auto',per_word_topics=True)
# Print the 7 discovered topics and their top words.
from pprint import pprint
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]
texts=data_words_bigrams
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus)) # a measure of how good the model is. lower the better.
# Compute Coherence Score (c_v; higher is better)
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)
# Visualize the topics interactively in the notebook
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis
# Deep-copy itTicketDF so the word-count analysis does not mutate the
# working dataframe.
data = itTicketDF.copy()
data['num_words'] = data.Description.apply(lambda x : len(x.split()))
data.describe().transpose()
# Bucket tickets by description word count.  (The previous throwaway
# `bins=[0,50,75,np.inf]` assignment was dead code — the real bin edges
# are the ones passed inline below — so it has been removed.)
data['bins']=pd.cut(data.num_words, bins=[0,100,300,500,800, np.inf], labels=['0-100', '100-300', '300-500','500-800' ,'>800'])
word_distribution = data.groupby('bins').size().reset_index().rename(columns={0:'counts'})
word_distribution
sns.barplot(x='bins', y='counts', data=word_distribution).set_title("Word distribution per bin")
Observation: the bar chart above shows how ticket descriptions are distributed across the word-count bins.
# Create the Dataset for classifying GRP_0 vs everything else (binary view).
itTicketGRP0DF = itTicketDF.copy()
itTicketGRP0DF['Assignment group']=itTicketGRP0DF['Assignment group'].apply(lambda x : 'other' if x != 'GRP_0' else x)
descending_order = itTicketGRP0DF['Assignment group'].value_counts().sort_values(ascending=False).index
plt.subplots(figsize=(5,5))
sns.countplot(x='Assignment group', data=itTicketGRP0DF, color='royalblue',order=descending_order)
#Create Dataset for 'others' i.e all groups which are not part of GRP_0
itTicketOthersDF = itTicketDF[itTicketDF['Assignment group'] != 'GRP_0']
descending_order = itTicketOthersDF['Assignment group'].value_counts().sort_values(ascending=False).index
plt.subplots(figsize=(22,5))
#rotate the x labels for readability
ax=sns.countplot(x='Assignment group', data=itTicketOthersDF, color='royalblue',order=descending_order)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
plt.tight_layout()
plt.show()
# Largest class size among the non-GRP_0 groups; used as the resampling target.
maxOthers = itTicketOthersDF['Assignment group'].value_counts().max()
maxOthers
# Treat the imbalance in the 'other' dataset: upsample every group (with
# replacement) to maxOthers/2 rows.
from sklearn.utils import resample
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call; collect the parts and concat once instead.
resampled_parts = [itTicketOthersDF[0:0]]  # empty slice keeps columns/dtypes
for grp in itTicketOthersDF['Assignment group'].unique():
    itTicketGrpDF = itTicketOthersDF[itTicketOthersDF['Assignment group'] == grp]
    resampled_parts.append(resample(itTicketGrpDF, replace=True, n_samples=int(maxOthers/2), random_state=123))
itTicketOthersDF_resampled = pd.concat(resampled_parts)
# Recombine with the binary GRP_0 dataset for the two-stage model.
otherGrpsResampled = pd.concat([itTicketGRP0DF,itTicketOthersDF_resampled])
otherGrpsResampled.reset_index(inplace=True)
# Plot the class distribution of the resampled 'others' dataset.
# NOTE(review): descending_order is computed here but not passed as
# `order=` to countplot below — confirm whether that was intended.
descending_order = itTicketOthersDF_resampled['Assignment group'].value_counts().sort_values(ascending=False).index
plt.subplots(figsize=(22,5))
#rotate the x labels for readability
ax=sns.countplot(x='Assignment group', data=itTicketOthersDF_resampled, color='royalblue')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
plt.tight_layout()
plt.show()
# Treat the imbalance in the full itTicketDF: upsample every group (with
# replacement) to maxOthers rows, producing one balanced dataset for the
# single-model experiment.
# DataFrame.append was removed in pandas 2.0; one concat over collected
# parts replaces the quadratic per-iteration appends.
resampled_full_parts = [itTicketDF[0:0]]  # empty slice keeps columns/dtypes
for grp in itTicketDF['Assignment group'].unique():
    itTicketGrpDF = itTicketDF[itTicketDF['Assignment group'] == grp]
    resampled_full_parts.append(resample(itTicketGrpDF, replace=True, n_samples=int(maxOthers), random_state=123))
itTicketDF_resampled = pd.concat(resampled_full_parts)
# Plot the class distribution of the fully resampled dataset.
# NOTE(review): descending_order is computed but not passed as `order=`
# to countplot below — confirm whether that was intended.
descending_order = itTicketDF_resampled['Assignment group'].value_counts().sort_values(ascending=False).index
plt.subplots(figsize=(22,5))
#rotate the x labels for readability
ax=sns.countplot(x='Assignment group', data=itTicketDF_resampled, color='royalblue')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
plt.tight_layout()
plt.show()
from sklearn import preprocessing
def labelencoder(dataframe) :
    """Label-encode a target Series.

    Returns (encoded_values, mapping) where mapping is a dict of
    {encoded_int: original_label} for decoding predictions later.
    """
    label_encoder = preprocessing.LabelEncoder()
    dataframe= label_encoder.fit_transform(dataframe)
    grp_mapping = dict(zip(label_encoder.transform(label_encoder.classes_), label_encoder.classes_))
    return dataframe,grp_mapping
#itTicketGRP0DF['Assignment group'] , grp_mapping_grp0= labelencoder(itTicketGRP0DF['Assignment group'])
#itTicketOthersDF_resampled['Assignment group'] , grp_mapping_others_resampled= labelencoder(itTicketOthersDF_resampled['Assignment group'])
# Encode the targets of each dataset variant, keeping the int->label maps.
otherGrpsResampled['Assignment group'] , grp_mapping_others_resampled= labelencoder(otherGrpsResampled['Assignment group'])
itTicketDF_resampled['Assignment group'] , grp_mapping_all_resampled= labelencoder(itTicketDF_resampled['Assignment group'])
itTicketDF['Assignment group'],grp_mapping_all_raw = labelencoder(itTicketDF['Assignment group'])
# Deep-learning stack: gensim word2vec embeddings + tensorflow.keras models,
# with sklearn for splitting/metrics.
from gensim.models import Word2Vec
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Flatten, Bidirectional, GlobalMaxPool1D,GRU,Conv1D,MaxPooling1D
from tensorflow.keras.models import Model, Sequential
import tensorflow as tf
from sklearn import metrics
from tensorflow.keras import backend as K
import matplotlib.pyplot as plt
from tensorflow.keras.utils import plot_model
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Function to capture the training results from each model run.
import operator
def captureData(dataframe,modelHistory,modelName,descriptions,index_df,resetData):
    """Append the best-epoch metrics of *modelHistory* to *dataframe*.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Accumulator of results; a new dataframe is returned (input unchanged).
    modelHistory : keras History-like object
        Its .history dict must hold per-epoch lists under 'val_accuracy',
        'val_loss', 'loss' and 'accuracy'.
    modelName, descriptions : str
        Labels stored alongside the metrics.
    index_df : str or int
        Row index label for the new entry.
    resetData : int
        When 1, discard everything and return an empty accumulator.
    """
    if resetData == 1:
        # Reset: return a fresh accumulator.  (The original also built an
        # unused temp dataframe here — dead code, removed.)
        return pd.DataFrame()
    # Pick the epoch with the best validation accuracy and report all
    # metrics from that same epoch.
    index, acc_value = max(enumerate(modelHistory.history['val_accuracy']), key=operator.itemgetter(1))
    tempResultsDf = pd.DataFrame(
        {'model':[modelName],
         'val_accuracy': [acc_value],
         'val_loss':[modelHistory.history['val_loss'][index]],
         'loss':[modelHistory.history['loss'][index]],
         'accuracy':[modelHistory.history['accuracy'][index]],
         'descriptions':[descriptions]}, index=[str(index_df)])  # list index, not a set
    dataframe = pd.concat([dataframe,tempResultsDf])
    # Fixed column order for display.
    return dataframe[['model','val_accuracy' ,'val_loss','loss','accuracy','descriptions']]
def capturePrediction(dataframe,modelName,descriptions,index_df,pred_accuracy,resetData):
    """Append one test-set prediction-accuracy row to *dataframe*.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Accumulator of results; a new dataframe is returned (input unchanged).
    modelName, descriptions : str
        Labels stored alongside the accuracy.
    index_df : str or int
        Row index label for the new entry.
    pred_accuracy : float
        Accuracy of the model on its held-out test split.
    resetData : int
        When 1, discard everything and return an empty accumulator.
    """
    if resetData == 1:
        # Reset: return a fresh accumulator (dead temp-dataframe removed).
        return pd.DataFrame()
    tempResultsDf = pd.DataFrame(
        {'model':[modelName],
         'Pred_Accuracy' : [pred_accuracy],
         'descriptions':[descriptions]}, index=[str(index_df)])
    dataframe = pd.concat([dataframe,tempResultsDf])
    return dataframe[['model','Pred_Accuracy','descriptions']]
# Train word2vec on the cleaned descriptions and persist the vectors.
sentences = [line.split(' ') for line in itTicketDF['Description']]
word2vec = Word2Vec(sentences=sentences,min_count=1)
word2vec.wv.save_word2vec_format(project_path+ 'word2vec_vector.txt')
# load the whole embedding into memory
embeddings_index = dict()
# Use a context manager so the file is closed even if a line fails to parse
# (the original open()/close() pair only closed on the happy path).
with open(project_path+'word2vec_vector.txt') as f:
    for line in f:
        values = line.split()
        # NOTE(review): the first line of a word2vec text file is the
        # "<vocab_size> <dim>" header; as in the original code it is stored
        # as one bogus entry here — harmless for word lookups.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))
maxlen = 300           # padded sequence length for every model below
numWords=9000          # tokenizer vocabulary cap
epochs = 10
results=pd.DataFrame()         # training-metric accumulator
pred_results = pd.DataFrame()  # prediction-accuracy accumulator
class LstmModel:
    """Bidirectional-LSTM classifier over tokenized ticket descriptions,
    seeding the embedding layer from the pre-loaded word2vec vectors
    (module-level ``embeddings_index``).
    """
    # NOTE(review): class-level mutable attributes — shared across
    # instances until shadowed by an instance assignment.
    model= Model()
    X_test=[]
    y_test=[]
    embedding_matrix=[]
    def wordTokenizer(self, dataframe):
        # Fit a Keras tokenizer on the text series and convert each text
        # into a sequence of word indices (top `numWords` words kept).
        tokenizer = Tokenizer(num_words=numWords,filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',lower=True,split=' ', char_level=False)
        tokenizer.fit_on_texts(dataframe)
        dataframe = tokenizer.texts_to_sequences(dataframe)
        return tokenizer,dataframe
    def splitData(self,X,y):
        """Produce 80/20 train/test and train/val splits."""
        print("Number of Samples:", len(X))
        print("Number of Labels: ", len(y))
        # NOTE(review): both splits use the same data with random_state=10,
        # so X_Val is identical to X_test — "validation" metrics are
        # effectively computed on the test set.  Confirm if intended.
        X_train, self.X_test, y_train, self.y_test = train_test_split(X, y, test_size=0.2, random_state=10) # changed by Abraham
        X_train, X_Val, y_train, y_Val = train_test_split(X, y, test_size=0.2, random_state=10)
        print("Number of train Samples:", len(X_train))
        print("Number of val Samples:", len(X_Val))
        return X_train, self.X_test, y_train, self.y_test, X_Val, y_Val
    def tokenizeAndEmbedding(self,dataframe):
        """Tokenize + pad descriptions and build the word2vec-seeded
        embedding matrix for the `numWords` vocabulary."""
        tokenizer,X = self.wordTokenizer(dataframe['Description'])
        y = np.asarray(dataframe['Assignment group'])
        X = pad_sequences(X, maxlen = maxlen)
        self.embedding_matrix = np.zeros((numWords+1, 100))
        for i,word in tokenizer.index_word.items():
            if i<numWords+1:
                embedding_vector = embeddings_index.get(word)
                if embedding_vector is not None:
                    self.embedding_matrix[i] = embedding_vector
        return X,y
    def train(self, dataframe, batch_size, epochs):
        """End-to-end training; returns fitModel's (history, model) tuple."""
        X,y = self.tokenizeAndEmbedding(dataframe)
        X_train, _, y_train, _, X_Val, y_Val = self.splitData(X,y)
        model_history = self.fitModel(X_train,y_train,X_Val,y_Val,batch_size, epochs)
        return model_history
    def fitModel(self,X_train,y_train,X_Val,y_Val,batch_size, epochs):
        """Build, compile and fit the Bi-LSTM; returns (history, model)."""
        input_layer = Input(shape=(maxlen,),dtype=tf.int64)
        # trainable=True lets the word2vec weights be fine-tuned.
        embed = Embedding(numWords+1,output_dim=100,input_length=maxlen,weights=[self.embedding_matrix], trainable=True)(input_layer) #weights=[embedding_matrix]
        lstm=Bidirectional(LSTM(128))(embed)
        drop=Dropout(0.3)(lstm)
        dense =Dense(100,activation='relu')(drop)
        # One softmax unit per distinct label present in y_train.
        out=Dense(len((pd.Series(y_train)).unique()),activation='softmax')(dense)
        self.model = Model(input_layer,out)
        self.model.compile(loss='sparse_categorical_crossentropy',optimizer="adam",metrics=['accuracy'])
        self.model.summary()
        plot_model(self.model,to_file="LSTM_Model.jpg")
        # Checkpoint the best val_accuracy; shrink LR when val_loss plateaus.
        checkpoint = ModelCheckpoint('model-{epoch:03d}-{val_accuracy:03f}.h5', verbose=1, monitor='val_accuracy',save_best_only=True, mode='auto')
        reduceLoss = ReduceLROnPlateau(monitor='val_loss', factor=0.2,patience=2, min_lr=0.0001)
        model_history = self.model.fit(X_train,y_train,batch_size=batch_size, epochs=epochs, callbacks=[checkpoint,reduceLoss], validation_data=(X_Val,y_Val))
        return model_history,self.model
    def prediction(self):
        """Accuracy on the test split stored by splitData."""
        pred = self.model.predict(self.X_test)
        pred = [i.argmax() for i in pred]
        accuracy = metrics.accuracy_score(self.y_test, pred)
        print("Accuracy of the model :",metrics.accuracy_score(self.y_test, pred))
        return accuracy
    def plotModelAccuracy(self, history, modelname):
        """Plot accuracy and loss curves, train vs validation."""
        plt.plot(history.history['accuracy'])
        plt.plot(history.history['val_accuracy'])
        plt.title(modelname+' model accuracy')
        plt.ylabel('accuracy')
        plt.xlabel('epoch')
        plt.legend(['train','test'], loc='upper left')
        plt.show()
        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title(modelname+' model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train','test'], loc='upper left')
        plt.show()
# Check how the LSTM Model performs with the cleansed (unsampled) data.
lstmModelRawData = LstmModel()
# train() passes through fitModel's (history, model) tuple.
lstmModelRawData_history, model = lstmModelRawData.train(itTicketDF,100,epochs)
rawData_accuracy = lstmModelRawData.prediction()
lstmModelRawData.plotModelAccuracy(lstmModelRawData_history, 'All Data Unsampled LSTM')
# Same model on the data resampled to 661 rows/class to balance the target.
lstmModelAllDataResampled = LstmModel()
lstmModelAllDataResampled_history, model = lstmModelAllDataResampled.train(itTicketDF_resampled,100,epochs)
resampled_accuracy = lstmModelAllDataResampled.prediction()
lstmModelAllDataResampled.plotModelAccuracy(lstmModelAllDataResampled_history, 'All Data Resampled LSTM')
# Reset the accumulators and record both runs.
results=pd.DataFrame()
pred_results = pd.DataFrame()
results=captureData(results,lstmModelRawData_history,'LSTM model_WV_rawdata','LSTM+Word2Vec Embedding on raw data','1',0)
pred_results= capturePrediction(pred_results,'LSTM model_WV_rawdata','LSTM+Word2Vec Embedding on raw data','1',rawData_accuracy,0)
results=captureData(results,lstmModelAllDataResampled_history,'LSTM model_WV_resampled data','LSTM+Word2Vec Embedding on Augmented data','2',0)
pred_results= capturePrediction(pred_results,'LSTM model_WV_resampled data','LSTM+Word2Vec Embedding on Augmented data','2',resampled_accuracy,0)
class TwoModel:
    """Two-stage classifier: ``model_1`` is a binary gate deciding GRP_0
    (encoded label 0) vs everything else; ``model_2`` then classifies the
    non-GRP_0 tickets into the remaining groups.
    """
    # NOTE(review): class-level mutable attributes shared across instances.
    model_1 = Model()
    model_2 = Model()
    embedding_matrix=[]
    def wordTokenizer(self, dataframe):
        # Fit the tokenizer, convert texts to index sequences, and build
        # the word2vec-seeded embedding matrix as a side effect.
        tokenizer = Tokenizer(num_words=numWords,filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',lower=True,split=' ', char_level=False)
        tokenizer.fit_on_texts(dataframe)
        dataframe = tokenizer.texts_to_sequences(dataframe)
        self.embedding_matrix = np.zeros((numWords+1, 100))
        for i,word in tokenizer.index_word.items():
            if i<numWords+1:
                embedding_vector = embeddings_index.get(word)
                if embedding_vector is not None:
                    self.embedding_matrix[i] = embedding_vector
        return tokenizer,dataframe
    def splitData(self,X,y):
        """Produce 80/20 train/test and train/val splits."""
        print("Number of Samples:", len(X))
        print("Number of Labels: ", len(y))
        # NOTE(review): both splits use random_state=10 on the same data,
        # so X_Val == X_test (validation leakage).  Confirm if intended.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10) # changed by Abraham
        X_train, X_Val, y_train, y_Val = train_test_split(X, y, test_size=0.2, random_state=10)
        print("Number of train Samples:", len(X_train))
        print("Number of val Samples:", len(X_Val))
        return X_train, X_test, y_train, y_test, X_Val, y_Val
    def runFirstModel(self,dataframe,epochs):
        """Stage 1: train the binary GRP_0 (0) vs other (1) model."""
        grp0_df = dataframe.copy()
        # Collapse every non-zero label to 1.
        grp0_df['Assignment group']=dataframe['Assignment group'].apply(lambda x : 1 if x != 0 else x)
        tokenizer,X = self.wordTokenizer(grp0_df['Description'])
        y = np.asarray(grp0_df['Assignment group'])
        X = pad_sequences(X, maxlen = maxlen)
        X_train, _, y_train, _, X_Val, y_Val = self.splitData(X,y)
        model_history,self.model_1 = self.modelRunner(X_train,y_train,X_Val,y_Val,epochs)
        return model_history,self.model_1
    def runSecondModel(self, dataframe,epochs):
        """Stage 2: train on the non-GRP_0 rows, labels shifted down by 1
        so they start at 0 for sparse_categorical_crossentropy."""
        grpOthers_df = dataframe.copy()
        grpOthers_df = grpOthers_df[grpOthers_df['Assignment group'] != 0]
        grpOthers_df['Assignment group']=grpOthers_df['Assignment group'] - 1
        tokenizer,X = self.wordTokenizer(grpOthers_df['Description'])
        y = np.asarray(grpOthers_df['Assignment group'])
        X = pad_sequences(X, maxlen = maxlen)
        X_train, _, y_train, _, X_Val, y_Val = self.splitData(X,y)
        model_history,self.model_2 = self.modelRunner(X_train,y_train,X_Val,y_Val,epochs)
        return model_history,self.model_2
    def modelRunner(self, X,Y,X_Val,Y_Val,epochs):
        """Build, compile and fit one Bi-LSTM; returns (history, model)."""
        input_layer = Input(shape=(maxlen,),dtype=tf.int64)
        embed = Embedding(input_dim = numWords+1,output_dim=100,input_length=maxlen,weights=[self.embedding_matrix], trainable=True)(input_layer) #
        lstm=Bidirectional(LSTM(128))(embed)
        drop=Dropout(0.3)(lstm)
        dense =Dense(100,activation='relu')(drop)
        # One softmax unit per distinct label present in Y.
        out=Dense(len((pd.Series(Y)).unique()),activation='softmax')(dense)
        batch_size = 100
        model = Model(input_layer,out)
        model.compile(loss='sparse_categorical_crossentropy',optimizer="adam",metrics=['accuracy'])
        checkpoint = ModelCheckpoint('model-{epoch:03d}-{val_accuracy:03f}.h5', verbose=1, monitor='val_accuracy',save_best_only=True, mode='auto')
        reduceLoss = ReduceLROnPlateau(monitor='val_loss', factor=0.2,patience=2, min_lr=0.0001)
        model_history = model.fit(X,Y,batch_size=batch_size, epochs=epochs, callbacks=[checkpoint,reduceLoss], validation_data=(X_Val,Y_Val))
        return model_history,model
    def predict(self, X_test):
        """Two-stage inference: rows model_1 flags as non-GRP_0 are routed
        through model_2; predictions are stitched back in row order."""
        predBinary = self.model_1.predict(X_test)
        # NOTE(review): assumes model_1 emits exactly two softmax scores per
        # row; class 1 wins when its score j exceeds class 0's score i.
        predBinary = [1 if j>i else 0 for i,j in predBinary]
        new_X_test = pd.DataFrame(X_test)
        new_X_test['grp']=predBinary
        sec_input = new_X_test[new_X_test['grp']!=0]
        # NOTE(review): dropping on a slice — pandas may emit
        # SettingWithCopyWarning here.
        sec_input.drop(['grp'],inplace=True, axis=1)
        new_X_test=new_X_test[new_X_test['grp']==0]
        predOther = self.model_2.predict(sec_input)
        predOther = [i.argmax() for i in predOther]
        # Undo the -1 label shift applied in runSecondModel.
        predOther= [i+1 for i in predOther]
        sec_input['grp']=predOther
        pred_df = pd.concat([new_X_test,sec_input])
        # Restore the original row order before returning.
        pred_df.sort_index(axis=0,inplace=True)
        return np.array(pred_df['grp'])
    def plotModelAccuracy(self, history, modelname):
        """Plot accuracy and loss curves, train vs validation."""
        plt.plot(history.history['accuracy'])
        plt.plot(history.history['val_accuracy'])
        plt.title(modelname+' model accuracy')
        plt.ylabel('accuracy')
        plt.xlabel('epoch')
        plt.legend(['train','test'], loc='upper left')
        plt.show()
        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title(modelname+' model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train','test'], loc='upper left')
        plt.show()
# Train the two-stage model on the resampled dataset (5 epochs for the
# binary gate, 20 for the multi-class stage).
model = TwoModel()
model1_history,_ = model.runFirstModel(otherGrpsResampled,5)
model2_history,_ = model.runSecondModel(otherGrpsResampled,20)
# Rebuild features for the full dataset and evaluate the combined pipeline
# on the held-out test split.
tokenizer,X = model.wordTokenizer(otherGrpsResampled['Description'])
y = np.asarray(otherGrpsResampled['Assignment group'])
X = pad_sequences(X, maxlen = maxlen)
_, X_test, _, y_test, _, _ = model.splitData(X,y)
predictions = model.predict(X_test)
twoModel_sampledAccuracy = metrics.accuracy_score(y_test, predictions)
print("Accuracy:",twoModel_sampledAccuracy)
model.plotModelAccuracy(model1_history, 'GRP0 vs Other')
model.plotModelAccuracy(model2_history, 'Other')
# Record metrics for both stages plus the combined prediction accuracy.
results=captureData(results,model1_history,'LSTM 2 part model_WV_grp0','LSTM+Word2Vec Embedding on grp0_data','3',0)
results=captureData(results,model2_history,'LSTM 2 part model_WV_Others','LSTM+Word2Vec Embedding on Rest of groups','4',0)
pred_results= capturePrediction(pred_results,'LSTM 2 part model_WV','LSTM+Word2Vec Embedding on Augmented data','3',twoModel_sampledAccuracy,0)
#download the glove embedding from https://nlp.stanford.edu/projects/glove/
#glove_file = "/content/drive/My Drive/ML_Project/NLP/NLP-1 Sentiment Classification/" + "glove.6B.zip"
glove_file = project_path + "glove.6B.zip"
print(glove_file)
#Extract Glove embedding zip file
from zipfile import ZipFile
with ZipFile(glove_file, 'r') as z:
    z.extractall()
# EMBEDDING_FILE = './glove.6B.200d.txt'
EMBEDDING_FILE = './glove.6B.100d.txt'
embeddings_glove = {}
# Read the GloVe vectors: one "<word> <v1> ... <v100>" line each.
# Fixes vs. the original: the bare `for o in open(...)` leaked the file
# handle (never closed), and each line was split twice; use a with-block
# and a single split.
with open(EMBEDDING_FILE, encoding='utf-8') as glove_fh:
    for line in glove_fh:
        parts = line.rstrip('\n').split(" ")
        embeddings_glove[parts[0]] = np.asarray(parts[1:], dtype='float32')
print("Dimension of the embedding vector is {}".format(len(embeddings_glove["collaboration"])))
print("Dimension of the embedding vector is {}".format(len(embeddings_glove["platform"])))
maxlen = 300
numWords=9000
epochs = 10
class LstmGloveModel:
model= Model()
X_test=[]
y_test=[]
embedding_matrix=[]
def wordTokenizer(self, dataframe):
tokenizer = Tokenizer(num_words=numWords,filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',lower=True,split=' ', char_level=False)
tokenizer.fit_on_texts(dataframe)
dataframe = tokenizer.texts_to_sequences(dataframe)
return tokenizer,dataframe
def splitData(self,X,y):
print("Number of Samples:", len(X))
print("Number of Labels: ", len(y))
X_train, self.X_test, y_train, self.y_test = train_test_split(X, y, test_size=0.2, random_state=10) # changed by Abraham
X_train, X_Val, y_train, y_Val = train_test_split(X, y, test_size=0.2, random_state=10)
print("Number of train Samples:", len(X_train))
print("Number of val Samples:", len(X_Val))
return X_train, self.X_test, y_train, self.y_test, X_Val, y_Val
def tokenizeAndEmbedding(self,dataframe):
tokenizer,X = self.wordTokenizer(dataframe['Description'])
y = np.asarray(dataframe['Assignment group'])
X = pad_sequences(X, maxlen = maxlen)
self.embedding_matrix = np.zeros((numWords+1, 100))
for i,word in tokenizer.index_word.items():
if i<numWords+1:
embedding_vector = embeddings_glove.get(word)
if embedding_vector is not None:
self.embedding_matrix[i] = embedding_vector
return X,y
def train(self, dataframe, batch_size, epochs):
X,y = self.tokenizeAndEmbedding(dataframe)
X_train, _, y_train, _, X_Val, y_Val = self.splitData(X,y)
model_history = self.fitModel(X_train,y_train,X_Val,y_Val,batch_size, epochs)
return model_history
def fitModel(self,X_train,y_train,X_Val,y_Val,batch_size, epochs):
input_layer = Input(shape=(maxlen,),dtype=tf.int64)
embed = Embedding(numWords+1,output_dim=100,input_length=maxlen,weights=[self.embedding_matrix], trainable=True)(input_layer) #weights=[embedding_matrix]
lstm=Bidirectional(LSTM(128))(embed)
drop=Dropout(0.3)(lstm)
dense =Dense(100,activation='relu')(drop)
out=Dense(len((pd.Series(y_train)).unique()),activation='softmax')(dense)
self.model = Model(input_layer,out)
self.model.compile(loss='sparse_categorical_crossentropy',optimizer="adam",metrics=['accuracy'])
checkpoint = ModelCheckpoint('model-{epoch:03d}-{val_accuracy:03f}.h5', verbose=1, monitor='val_accuracy',save_best_only=True, mode='auto')
reduceLoss = ReduceLROnPlateau(monitor='val_loss', factor=0.2,patience=2, min_lr=0.0001)
model_history = self.model.fit(X_train,y_train,batch_size=batch_size, epochs=epochs, callbacks=[checkpoint,reduceLoss], validation_data=(X_Val,y_Val))
return model_history,self.model
def prediction(self):
pred = self.model.predict(self.X_test)
pred = [i.argmax() for i in pred]
accuracy = metrics.accuracy_score(self.y_test, pred)
print("Accuracy of the model :",accuracy)
return accuracy
def plotModelAccuracy(self, history, modelname):
    """Plot the train-vs-validation accuracy curve, then the loss curve.

    The legend says 'test' but the second series is the validation split
    passed to model.fit — kept as-is to preserve output exactly.
    """
    for metric in ('accuracy', 'loss'):
        plt.plot(history.history[metric])
        plt.plot(history.history['val_' + metric])
        plt.title(modelname + ' model ' + metric)
        plt.ylabel(metric)
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()
# Check how the LSTM+GloVe model performs on the full (unbalanced) cleansed data.
# NOTE(review): LstmGloveModel.splitData runs both train_test_split calls on the
# full data with the same random_state, so its "validation" set equals the test
# set — checkpoint selection leaks test data; worth fixing in the class.
lstmModelRawData = LstmGloveModel()
# batch_size=100; `epochs` is a module-level global set earlier in the notebook.
lstmModelRawData_history, model = lstmModelRawData.train(itTicketDF,100,epochs)
lstm_raw_accuracy = lstmModelRawData.prediction()
lstmModelRawData.plotModelAccuracy(lstmModelRawData_history, 'All Data Unsampled LSTM')
#Check how the LSTM Model perform with all the data which is cleansed & resampled to 661 to make the target balance
lstmModelAllDataResampled = LstmGloveModel()
lstmModelAllDataResampled_history, model = lstmModelAllDataResampled.train(itTicketDF_resampled,100,epochs)
lstm_allResampled_accuracy = lstmModelAllDataResampled.prediction()
lstmModelAllDataResampled.plotModelAccuracy(lstmModelAllDataResampled_history, 'All Data Resampled LSTM')
# Append per-epoch history and final test accuracy to the shared comparison
# tables (captureData / capturePrediction are defined earlier in the notebook).
results=captureData(results,lstmModelRawData_history,'LSTM model_GloVe_rawdata','LSTM+GloVe Embedding on raw data','5',0)
pred_results= capturePrediction(pred_results,'LSTM model_GloVe_rawdata','LSTM+GloVe Embedding on raw data','4',lstm_raw_accuracy,0)
results=captureData(results,lstmModelAllDataResampled_history,'LSTM model_GloVe_resampled data','LSTM+GloVe Embedding on Augmented data','6',0)
pred_results= capturePrediction(pred_results,'LSTM model_GloVe_resampled data','LSTM+GloVe Embedding on Augmented data','5',lstm_allResampled_accuracy,0)
class GruGloveModel:
    """GRU text classifier with GloVe-initialised word embeddings.

    Mirrors the LSTM model class but uses a plain (unidirectional) GRU.
    Reads module-level globals: ``numWords``, ``maxlen`` and the
    ``embeddings_index`` GloVe lookup.
    NOTE(review): the LSTM/RNN classes read ``embeddings_glove`` instead of
    ``embeddings_index`` — confirm which lookup table is intended here.
    """

    def __init__(self):
        # Per-instance state. (The original defined these as class-level
        # mutable attributes, which are shared across all instances.)
        self.model = None            # set by fitModel()
        self.X_test = []             # held-out test split, set by splitData()
        self.y_test = []
        self.embedding_matrix = []   # (numWords+1, 100) GloVe weight matrix

    def wordTokenizer(self, dataframe):
        """Fit a Keras Tokenizer on the text series; return (tokenizer, sequences)."""
        tokenizer = Tokenizer(num_words=numWords,
                              filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                              lower=True, split=' ', char_level=False)
        tokenizer.fit_on_texts(dataframe)
        return tokenizer, tokenizer.texts_to_sequences(dataframe)

    def splitData(self, X, y):
        """80/20 train/test split, then carve a validation set out of train.

        Bug fix: the original ran BOTH splits on the full (X, y) with the
        same random_state, making the validation set identical to the test
        set (test leakage into checkpointing and LR scheduling). Validation
        is now drawn from the training portion only.
        """
        print("Number of Samples:", len(X))
        print("Number of Labels: ", len(y))
        X_train, self.X_test, y_train, self.y_test = train_test_split(
            X, y, test_size=0.2, random_state=10)
        X_train, X_Val, y_train, y_Val = train_test_split(
            X_train, y_train, test_size=0.2, random_state=10)
        print("Number of train Samples:", len(X_train))
        print("Number of val Samples:", len(X_Val))
        return X_train, self.X_test, y_train, self.y_test, X_Val, y_Val

    def tokenizeAndEmbedding(self, dataframe):
        """Tokenise + pad descriptions and build the embedding matrix; return (X, y)."""
        tokenizer, X = self.wordTokenizer(dataframe['Description'])
        y = np.asarray(dataframe['Assignment group'])
        X = pad_sequences(X, maxlen=maxlen)
        # Words absent from the pretrained lookup keep an all-zero row.
        self.embedding_matrix = np.zeros((numWords + 1, 100))
        for i, word in tokenizer.index_word.items():
            if i < numWords + 1:
                embedding_vector = embeddings_index.get(word)
                if embedding_vector is not None:
                    self.embedding_matrix[i] = embedding_vector
        return X, y

    def train(self, dataframe, batch_size, epochs):
        """End-to-end training; returns (History, Model) from fitModel."""
        X, y = self.tokenizeAndEmbedding(dataframe)
        X_train, _, y_train, _, X_Val, y_Val = self.splitData(X, y)
        return self.fitModel(X_train, y_train, X_Val, y_Val, batch_size, epochs)

    def fitModel(self, X_train, y_train, X_Val, y_Val, batch_size, epochs):
        """Build, compile and fit the GRU network; returns (History, Model)."""
        input_layer = Input(shape=(maxlen,), dtype=tf.int64)
        embed = Embedding(numWords + 1, output_dim=100, input_length=maxlen,
                          weights=[self.embedding_matrix], trainable=True)(input_layer)
        gru = GRU(128)(embed)
        drop = Dropout(0.3)(gru)
        dense = Dense(100, activation='relu')(drop)
        # One softmax unit per distinct training label.
        out = Dense(len(pd.Series(y_train).unique()), activation='softmax')(dense)
        self.model = Model(input_layer, out)
        self.model.compile(loss='sparse_categorical_crossentropy', optimizer="adam",
                           metrics=['accuracy'])
        checkpoint = ModelCheckpoint('model-{epoch:03d}-{val_accuracy:03f}.h5', verbose=1,
                                     monitor='val_accuracy', save_best_only=True, mode='auto')
        reduceLoss = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2,
                                       min_lr=0.0001)
        model_history = self.model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
                                       callbacks=[checkpoint, reduceLoss],
                                       validation_data=(X_Val, y_Val))
        return model_history, self.model

    def prediction(self):
        """Score the fitted model on the held-out test split; print and return accuracy."""
        pred = self.model.predict(self.X_test)
        pred = [i.argmax() for i in pred]
        accuracy = metrics.accuracy_score(self.y_test, pred)
        print("Accuracy of the model :", accuracy)
        return accuracy

    def plotModelAccuracy(self, history, modelname):
        """Plot train-vs-validation accuracy, then loss, over epochs."""
        for metric in ('accuracy', 'loss'):
            plt.plot(history.history[metric])
            plt.plot(history.history['val_' + metric])
            plt.title(modelname + ' model ' + metric)
            plt.ylabel(metric)
            plt.xlabel('epoch')
            plt.legend(['train', 'test'], loc='upper left')
            plt.show()

    def plotModel(self):
        """Print the Keras layer summary (requires train/fitModel to have run)."""
        self.model.summary()
# Check how the GRU+GloVe model performs on the full (unbalanced) cleansed data.
# Same pipeline as the LSTM runs above, swapping in GruGloveModel.
gruModelRawData = GruGloveModel()
# batch_size=100; `epochs` is a module-level global set earlier in the notebook.
gruModelRawData_history, model = gruModelRawData.train(itTicketDF,100,epochs)
gruRaw_accuracy = gruModelRawData.prediction()
gruModelRawData.plotModel()
gruModelRawData.plotModelAccuracy(gruModelRawData_history, 'All Data Unsampled GRU')
# Check how the GRU Model perform with all the data which is cleansed & resampled to 661 to make the target balance
gruModelAllDataResampled = GruGloveModel()
gruModelAllDataResampled_history, model = gruModelAllDataResampled.train(itTicketDF_resampled,100,epochs)
gruResampled_accuracy = gruModelAllDataResampled.prediction()
gruModelAllDataResampled.plotModelAccuracy(gruModelAllDataResampled_history, 'All Data Resampled GRU')
# Append per-epoch history and final test accuracy to the shared comparison tables.
results=captureData(results,gruModelRawData_history,'GRU model_GloVe_rawdata','GRU+GloVe Embedding on raw data','7',0)
pred_results= capturePrediction(pred_results,'GRU model_GloVe_rawdata','GRU+GloVe Embedding on raw data','6',gruRaw_accuracy,0)
results=captureData(results,gruModelAllDataResampled_history,'GRU model_GloVe_resampled data','GRU+GloVe Embedding on Augmented data','8',0)
pred_results= capturePrediction(pred_results,'GRU model_GloVe_resampled data','GRU+GloVe Embedding on Augmented data','7',gruResampled_accuracy,0)
class RNNGloveModel:
    """CNN + BiLSTM text classifier with GloVe-initialised embeddings.

    Unlike the LSTM/GRU classes, ``train`` builds a Sequential model inline
    and returns it alongside the History; ``prediction`` therefore takes the
    model as an argument instead of reading ``self.model``.
    Reads module-level globals: ``numWords``, ``maxlen``, ``embeddings_glove``.
    """

    def __init__(self):
        # Per-instance state. (The original defined these as class-level
        # mutable attributes, which are shared across all instances.)
        self.X_test = []             # held-out test split, set by splitData()
        self.y_test = []
        self.embedding_matrix = []   # (numWords+1, 100) GloVe weight matrix

    def wordTokenizer(self, dataframe):
        """Fit a Keras Tokenizer on the text series; return (tokenizer, sequences)."""
        tokenizer = Tokenizer(num_words=numWords,
                              filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                              lower=True, split=' ', char_level=False)
        tokenizer.fit_on_texts(dataframe)
        return tokenizer, tokenizer.texts_to_sequences(dataframe)

    def tokenizeAndEmbedding(self, dataframe):
        """Tokenise + pad descriptions and build the embedding matrix; return (X, y)."""
        tokenizer, X = self.wordTokenizer(dataframe['Description'])
        y = np.asarray(dataframe['Assignment group'])
        X = pad_sequences(X, maxlen=maxlen)
        # Words absent from the pretrained lookup keep an all-zero row.
        self.embedding_matrix = np.zeros((numWords + 1, 100))
        for i, word in tokenizer.index_word.items():
            if i < numWords + 1:
                embedding_vector = embeddings_glove.get(word)
                if embedding_vector is not None:
                    self.embedding_matrix[i] = embedding_vector
        return X, y

    def splitData(self, X, y):
        """80/20 train/test split, then carve a validation set out of train.

        Bug fix: the original ran BOTH splits on the full (X, y) with the
        same random_state, making the validation set identical to the test
        set (test leakage into checkpointing and LR scheduling). Validation
        is now drawn from the training portion only.
        """
        print("Number of Samples:", len(X))
        print("Number of Labels: ", len(y))
        X_train, self.X_test, y_train, self.y_test = train_test_split(
            X, y, test_size=0.2, random_state=10)
        X_train, X_Val, y_train, y_Val = train_test_split(
            X_train, y_train, test_size=0.2, random_state=10)
        print("Number of train Samples:", len(X_train))
        print("Number of val Samples:", len(X_Val))
        return X_train, self.X_test, y_train, self.y_test, X_Val, y_Val

    def train(self, dataframe, batch_size, epochs):
        """Build and fit the Conv1D -> BiLSTM network; return (History, Model)."""
        X, y = self.tokenizeAndEmbedding(dataframe)
        X_train, _, y_train, _, X_Val, y_Val = self.splitData(X, y)
        embed = Embedding(numWords + 1, output_dim=100, input_length=maxlen,
                          weights=[self.embedding_matrix], trainable=True)
        model = Sequential()
        model.add(Input(shape=(maxlen,), dtype=tf.int64))
        model.add(embed)
        # Two conv/pool stages extract local n-gram features before the BiLSTM.
        model.add(Conv1D(100, 10, activation='relu'))
        model.add(MaxPooling1D(pool_size=2))
        model.add(Dropout(0.3))
        model.add(Conv1D(100, 10, activation='relu'))
        model.add(MaxPooling1D(pool_size=2))
        model.add(Bidirectional(LSTM(128)))
        model.add(Dropout(0.3))
        model.add(Dense(100, activation='relu'))
        # One softmax unit per distinct training label.
        model.add(Dense(len(pd.Series(y_train).unique()), activation='softmax'))
        model.compile(loss='sparse_categorical_crossentropy', optimizer="adam",
                      metrics=['accuracy'])
        model.summary()
        plot_model(model, to_file="RNN.jpg")  # requires pydot/graphviz
        checkpoint = ModelCheckpoint('model-{epoch:03d}-{val_accuracy:03f}.h5', verbose=1,
                                     monitor='val_accuracy', save_best_only=True, mode='auto')
        reduceLoss = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2,
                                       min_lr=0.0001)
        model_history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
                                  callbacks=[checkpoint, reduceLoss],
                                  validation_data=(X_Val, y_Val))
        return model_history, model

    def prediction(self, model):
        """Score the given fitted model on the held-out test split; return accuracy."""
        pred = model.predict(self.X_test)
        pred = [i.argmax() for i in pred]
        accuracy = metrics.accuracy_score(self.y_test, pred)
        print("Accuracy of the model :", accuracy)
        return accuracy

    def plotModelAccuracy(self, history, modelname):
        """Plot train-vs-validation accuracy, then loss, over epochs."""
        for metric in ('accuracy', 'loss'):
            plt.plot(history.history[metric])
            plt.plot(history.history['val_' + metric])
            plt.title(modelname + ' model ' + metric)
            plt.ylabel(metric)
            plt.xlabel('epoch')
            plt.legend(['train', 'test'], loc='upper left')
            plt.show()
# Check how the CNN+BiLSTM ("RNN") model performs on the full cleansed data.
RNNModelRawData = RNNGloveModel()
# batch_size=100; `epochs` is a module-level global set earlier in the notebook.
RNNModelRawData_history, RnnModel = RNNModelRawData.train(itTicketDF,100,epochs)
RNNModelRawData_accuracy = RNNModelRawData.prediction(RnnModel)
RNNModelRawData.plotModelAccuracy(RNNModelRawData_history, 'All Data Unsampled RNN')
#Check how the RNN Model perform with all the data which is cleansed & resampled to 661 to make the target balance
RNNModelAllDataResampled = RNNGloveModel()
RNNModelAllDataResampled_history, RnnModel = RNNModelAllDataResampled.train(itTicketDF_resampled,100,epochs)
RNNModelResampled_accuracy = RNNModelAllDataResampled.prediction(RnnModel)
RNNModelAllDataResampled.plotModelAccuracy(RNNModelAllDataResampled_history, 'All Data Resampled RNN')
# results=pd.DataFrame()
# Append per-epoch history and final test accuracy to the shared comparison tables.
results=captureData(results,RNNModelRawData_history,'RNN model_GloVe_rawdata','RNN+GloVe Embedding on raw data','9',0)
pred_results= capturePrediction(pred_results,'RNN model_GloVe_rawdata','RNN+GloVe Embedding on raw data','8',RNNModelRawData_accuracy,0)
results=captureData(results,RNNModelAllDataResampled_history,'RNN model_GloVe_resampled data','RNN+GloVe Embedding on Augmented data','10',0)
pred_results= capturePrediction(pred_results,'RNN model_GloVe_resampled data','RNN+GloVe Embedding on Augmented data','9',RNNModelResampled_accuracy,0)
# Bare expressions: in a notebook these display the comparison tables.
results
pred_results
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
def wordTokenizer(dataframe):
    """Fit a Keras Tokenizer on the given text series.

    Returns (tokenizer, sequences) where sequences is the input converted
    to lists of word indices, capped at the numWords most frequent words.
    """
    tok = Tokenizer(num_words=numWords,
                    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                    lower=True, split=' ', char_level=False)
    tok.fit_on_texts(dataframe)
    return tok, tok.texts_to_sequences(dataframe)
# Shared preprocessing hyper-parameters (also read as module-level globals
# by the Keras model classes above).
maxlen = 150
numWords = 9000

# Tokenise + pad the descriptions once for the classical ML baselines.
tokenizer, X = wordTokenizer(itTicketDF['Description'])
y = np.asarray(itTicketDF['Assignment group'])
X = pad_sequences(X, maxlen=maxlen)
print("Number of Samples:", len(X))
print("Number of Labels: ", len(y))
# Single 80/20 hold-out split; the sklearn baselines use no validation set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
print("Number of train Samples:", len(X_train))
# Bug fix: the original label said "val Samples" but this is the TEST split.
print("Number of test Samples:", len(X_test))

# Random-forest baseline directly on the padded token-id matrix.
rawClf = RandomForestClassifier(n_estimators=100)
rawClf.fit(X_train, y_train)
y_pred = rawClf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# Store the accuracy results for each model in a dataframe for final comparison.
# (index is a list now — the original passed a set, which is unordered.)
results_ml = pd.DataFrame()
tempResultsDf_ml = pd.DataFrame({'Model': ['Random Forest'],
                                 'accuracy': [metrics.accuracy_score(y_test, y_pred)]},
                                index=['1'])
results_ml = pd.concat([results_ml, tempResultsDf_ml])
results_ml
# Grid-search an RBF-kernel SVM over C and gamma (default 5-fold CV).
Cs = [0.01, 0.1, 1, 10]
gammas = [0.001, 0.01, 0.1, 1]
param_grid = {'C': Cs, 'gamma': gammas}
svc = svm.SVC(kernel='rbf')
grid_search = GridSearchCV(svc, param_grid)
grid_search.fit(X_train, y_train)

# GridSearchCV already refits the best configuration on the whole training
# set (refit=True by default), so reuse best_estimator_ instead of building
# and fitting a fresh SVC from best_params_ — this avoids a redundant fit
# and is guaranteed to keep every tuned parameter (including the kernel).
clf = grid_search.best_estimator_
y_pred = clf.predict(X_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)  # notebook display of the test accuracy

# Store the accuracy results for each model in a dataframe for final comparison.
# (index is a list now — the original passed a set, which is unordered.)
tempResultsDf_ml = pd.DataFrame({'Model': ['SVM Classifier'],
                                 'accuracy': [metrics.accuracy_score(y_test, y_pred)]},
                                index=['2'])
results_ml = pd.concat([results_ml, tempResultsDf_ml])
results_ml